STRATAA microbiome
analysis
Imports
library(rbiom)
library(kableExtra)
library(ggplot2)
library(ggpubr)
library(ggforce)
library(patchwork)
library(vegan)
library(dplyr)
library(forcats)
Sources
The file handles are set in config.R as they’re used by both this
script and data_cleaning.
source("/Users/flashton/Dropbox/GordonGroup/STRATAA_Microbiome/from_Leo/Leonardos_analysis/bin/core_functions.R")
source("/Users/flashton/Dropbox/GordonGroup/STRATAA_Microbiome/from_Leo/Leonardos_analysis/bin/config.R")
Metadata basics
First do the plots with kruskall wallis comparison between groups to
see if there’s an overall difference, then do the pairwise tests because
KW says there is a difference.
metadata <- read_metadata(metadata_handle)
metadata <- metadata %>% mutate(Group = if_else(Group == 'Control_HealthySerosurvey', 'Healthy Control', Group))
metadata <- metadata %>% mutate(Group = if_else(Group == 'Acute_Typhi', 'Acute typhoid', Group))
number_per_country <- metadata %>% group_by(Country) %>% summarise(count = n())
number_per_country %>% kbl() %>% kable_styling()
|
Country
|
count
|
|
Bangladesh
|
120
|
|
Malawi
|
114
|
|
Nepal
|
76
|
print(sum(number_per_country$count))
## [1] 310
metadata %>% group_by(Group) %>% summarise(count = n()) %>% kbl() %>% kable_styling()
|
Group
|
count
|
|
Acute typhoid
|
103
|
|
Carrier
|
110
|
|
Healthy Control
|
97
|
baseline_chars <- get_baseline_characteristics(metadata)
baseline_chars$pct_anti
## [1] 37.50000 0.00000 0.00000 26.47059 0.00000 0.00000 44.82759 0.00000
## [9] 0.00000
baseline_chars$pct_anti %>% na_if(0)
## [1] 37.50000 NA NA 26.47059 NA NA 44.82759 NA
## [9] NA
# BEWARE - na_if(0.0) will convert ALL 0s to NAs, this is ok as they're only in the antibiotics of carriers and controls
# at the moment, but if others get added in, will screw with it.
# baseline_chars %>% rename(`Median age` = median_age, `Women (%)` = pct_fem, `Antibiotics in last 2 weeks (%)` = pct_anti) %>% pivot_longer(!c(Country, Group)) %>% rename(variable_name = name) %>% pivot_wider(names_from = Group, values_from = value) %>% filter(variable_name != 'number') %>% na_if(0.0) %>% kbl(digits = c(NA, NA, 1, 1, 1)) %>% kable_styling()
# na_if stopped working, so had to do this instead
baseline_chars_table <- baseline_chars %>% rename(`Median age` = median_age, `Women (%)` = pct_fem, `Antibiotics in last 2 weeks (%)` = pct_anti) %>% pivot_longer(!c(Country, Group)) %>% rename(variable_name = name) %>% pivot_wider(names_from = Group, values_from = value) %>% filter(variable_name != 'number')
baseline_chars_table$Carrier <- replace(baseline_chars_table$Carrier,which(baseline_chars_table$Carrier==0),NA)
baseline_chars_table$`Healthy Control` <- replace(baseline_chars_table$`Healthy Control`,which(baseline_chars_table$`Healthy Control`==0),NA)
baseline_chars_table %>% kbl(digits = c(NA, NA, 1, 1, 1)) %>% kable_styling()
|
Country
|
variable_name
|
Acute typhoid
|
Carrier
|
Healthy Control
|
|
Bangladesh
|
Median age
|
6.0
|
37.0
|
28.5
|
|
Bangladesh
|
Women (%)
|
47.5
|
47.5
|
65.0
|
|
Bangladesh
|
Antibiotics in last 2 weeks (%)
|
37.5
|
NA
|
NA
|
|
Malawi
|
Median age
|
9.7
|
32.0
|
24.0
|
|
Malawi
|
Women (%)
|
64.7
|
57.5
|
65.0
|
|
Malawi
|
Antibiotics in last 2 weeks (%)
|
26.5
|
NA
|
NA
|
|
Nepal
|
Median age
|
17.2
|
43.9
|
35.0
|
|
Nepal
|
Women (%)
|
24.1
|
66.7
|
82.4
|
|
Nepal
|
Antibiotics in last 2 weeks (%)
|
44.8
|
NA
|
NA
|
ggplot(metadata, aes(x = Country, y = Age, fill = Group)) + geom_boxplot() + stat_compare_means(method = 'kruskal.test', label = "p")

comparisons_groups <- list(c("Acute typhoid", "Carrier"), c("Acute typhoid", "Healthy Control"), c("Carrier", "Healthy Control"))
comparisons_countries <- list(c('Bangladesh', 'Malawi'), c('Bangladesh', 'Nepal'), c('Malawi', 'Nepal'))
# default stat method when doing pairwise tests in wilcoxon.
ggboxplot(metadata, facet.by = "Country", y = "Age", x = "Group", color = "Group") + stat_compare_means(comparisons = comparisons_groups) + rremove("x.text") + rremove("xlab") + rremove("x.ticks") # +rotate_x_text(angle = 45)

ggboxplot(metadata, facet.by = "Group", y = "Age", x = "Country", color = "Country") + stat_compare_means(comparisons = comparisons_countries) + rotate_x_text(angle = 45)

country_group_sex <- metadata %>% group_by(Country, Group, Sex) %>% summarise(count = n())
plot_sex <- function(eg1, c){
d <- eg1 %>% filter(Country == c)
p <- ggplot(d, aes(x = Group, y = count, fill = Sex)) +
geom_bar(stat ='identity', position = 'fill') +
ylab('Proportion') +
ggtitle(c)# +
#theme(axis.text=element_text(size=34), axis.title=element_text(size=36,face="bold"), plot.title = element_text(size = 40, face = "bold"), legend.key.size = unit(4, 'cm'), legend.title = element_text(size = 34), legend.text = element_text(size = 28))
return(p)
}
m_sex <- plot_sex(country_group_sex, 'Malawi')
b_sex <- plot_sex(country_group_sex, 'Bangladesh')
n_sex <- plot_sex(country_group_sex, 'Nepal')
m_sex / b_sex / n_sex

Alpha diversity -
metaphlan
Alpha diversity - all countries, all groups
all_countries_all_groups_alpha <- metaphlan_alpha(strataa_metaphlan_data, strataa_metaphlan_metadata, countries_of_interest = c('Bangladesh', 'Malawi', 'Nepal'), groups_of_interest = c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'), comparisons = list(c('Acute_Typhi', 'Carrier'), c('Acute_Typhi', 'Control_HealthySerosurvey'), c('Control_HealthySerosurvey', 'Carrier')))

all_countries_all_groups_alpha$alpha_anova_summary_with_var_names %>% dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% kbl() %>% kable_styling()
|
rownames(alpha_anova_summary[[1]])
|
Df
|
Sum.Sq
|
Mean.Sq
|
F.value
|
Pr..F.
|
is_it_significant
|
|
Country:Group
|
4
|
7.83
|
1.96
|
12.4
|
3.09e-09
|
significant
|
|
Group
|
2
|
5.92
|
2.96
|
18.7
|
2.55e-08
|
significant
|
|
Country
|
2
|
2.75
|
1.38
|
8.7
|
0.00022
|
significant
|
|
Country:Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
2
|
1.83
|
0.913
|
5.77
|
0.00352
|
significant
|
|
Country:Sex:Antibiotics_taken_before_sampling_yes_no_assumptions
|
2
|
1.25
|
0.625
|
3.95
|
0.0204
|
not_significant
|
|
Sex:Group
|
2
|
1.13
|
0.566
|
3.58
|
0.0292
|
not_significant
|
|
Country:Sex
|
2
|
1.11
|
0.553
|
3.49
|
0.0318
|
not_significant
|
|
Country:Sex:Age
|
2
|
1.03
|
0.517
|
3.27
|
0.0396
|
not_significant
|
|
Country:Antibiotics_taken_before_sampling_yes_no_assumptions
|
2
|
0.835
|
0.417
|
2.64
|
0.0733
|
not_significant
|
|
Country:Age
|
2
|
0.576
|
0.288
|
1.82
|
0.164
|
not_significant
|
|
Country:Group:Age
|
4
|
1.03
|
0.256
|
1.62
|
0.169
|
not_significant
|
|
Sex:Age
|
1
|
0.298
|
0.298
|
1.88
|
0.171
|
not_significant
|
|
Antibiotics_taken_before_sampling_yes_no_assumptions
|
2
|
0.474
|
0.237
|
1.5
|
0.226
|
not_significant
|
|
Country:Sex:Group
|
4
|
0.817
|
0.204
|
1.29
|
0.274
|
not_significant
|
|
Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
2
|
0.318
|
0.159
|
1.01
|
0.367
|
not_significant
|
|
Group:Age
|
2
|
0.239
|
0.119
|
0.754
|
0.471
|
not_significant
|
|
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
1
|
0.0529
|
0.0529
|
0.334
|
0.564
|
not_significant
|
|
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions
|
1
|
0.0476
|
0.0476
|
0.301
|
0.584
|
not_significant
|
|
Sex
|
1
|
0.0294
|
0.0294
|
0.186
|
0.667
|
not_significant
|
|
Sex:Group:Age
|
2
|
0.051
|
0.0255
|
0.161
|
0.851
|
not_significant
|
|
Country:Sex:Group:Age
|
4
|
0.127
|
0.0317
|
0.201
|
0.938
|
not_significant
|
|
Country:Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
1
|
0.000136
|
0.000136
|
0.000861
|
0.977
|
not_significant
|
|
Age
|
1
|
1.07e-08
|
1.07e-08
|
6.75e-08
|
1
|
not_significant
|
|
Residuals
|
261
|
41.3
|
0.158
|
NA
|
NA
|
NA
|
all_countries_all_groups_alpha$alpha_plot_group

all_countries_all_groups_alpha$alpha_plot_antibiotics

Alpha diversity - all countries, healthy and acute
all_countries_healthy_acute_alpha <- metaphlan_alpha(strataa_metaphlan_data, strataa_metaphlan_metadata, countries_of_interest = c('Bangladesh', 'Malawi', 'Nepal'), groups_of_interest = c('Acute_Typhi', 'Control_HealthySerosurvey'), comparisons = list(c('Acute_Typhi', 'Control_HealthySerosurvey')))

all_countries_healthy_acute_alpha$alpha_by_country

all_countries_healthy_acute_alpha$alpha_anova_summary_with_var_names %>% dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% kbl() %>% kable_styling()
|
rownames(alpha_anova_summary[[1]])
|
Df
|
Sum.Sq
|
Mean.Sq
|
F.value
|
Pr..F.
|
is_it_significant
|
|
Country
|
2
|
5.74
|
2.87
|
17.4
|
1.36e-07
|
significant
|
|
Country:Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
2
|
1.68
|
0.842
|
5.12
|
0.00699
|
significant
|
|
Sex:Age
|
1
|
1.07
|
1.07
|
6.5
|
0.0117
|
not_significant
|
|
Country:Sex
|
2
|
1.5
|
0.75
|
4.56
|
0.0119
|
not_significant
|
|
Country:Sex:Antibiotics_taken_before_sampling_yes_no_assumptions
|
2
|
1.39
|
0.693
|
4.21
|
0.0165
|
not_significant
|
|
Country:Age
|
2
|
1.02
|
0.51
|
3.1
|
0.0477
|
not_significant
|
|
Sex:Group
|
1
|
0.628
|
0.628
|
3.82
|
0.0525
|
not_significant
|
|
Country:Antibiotics_taken_before_sampling_yes_no_assumptions
|
2
|
0.816
|
0.408
|
2.48
|
0.0871
|
not_significant
|
|
Country:Group
|
2
|
0.698
|
0.349
|
2.12
|
0.123
|
not_significant
|
|
Antibiotics_taken_before_sampling_yes_no_assumptions
|
2
|
0.618
|
0.309
|
1.88
|
0.157
|
not_significant
|
|
Group
|
1
|
0.164
|
0.164
|
0.997
|
0.319
|
not_significant
|
|
Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
2
|
0.359
|
0.179
|
1.09
|
0.339
|
not_significant
|
|
Sex
|
1
|
0.0592
|
0.0592
|
0.36
|
0.55
|
not_significant
|
|
Country:Sex:Age
|
2
|
0.188
|
0.0938
|
0.57
|
0.567
|
not_significant
|
|
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
1
|
0.0511
|
0.0511
|
0.31
|
0.578
|
not_significant
|
|
Sex:Group:Age
|
1
|
0.0368
|
0.0368
|
0.224
|
0.637
|
not_significant
|
|
Group:Age
|
1
|
0.0147
|
0.0147
|
0.0891
|
0.766
|
not_significant
|
|
Country:Group:Age
|
2
|
0.0713
|
0.0357
|
0.217
|
0.805
|
not_significant
|
|
Age
|
1
|
0.00772
|
0.00772
|
0.0469
|
0.829
|
not_significant
|
|
Country:Sex:Group
|
2
|
0.0595
|
0.0297
|
0.181
|
0.835
|
not_significant
|
|
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions
|
1
|
0.00675
|
0.00675
|
0.041
|
0.84
|
not_significant
|
|
Country:Sex:Group:Age
|
2
|
0.05
|
0.025
|
0.152
|
0.859
|
not_significant
|
|
Country:Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
1
|
0.000136
|
0.000136
|
0.000828
|
0.977
|
not_significant
|
|
Residuals
|
163
|
26.8
|
0.165
|
NA
|
NA
|
NA
|
all_countries_healthy_acute_alpha$alpha_plot_group

all_countries_healthy_acute_alpha$alpha_plot_antibiotics

alpha diversity, malawi only
malawi_healthy_acute_alpha <- metaphlan_alpha(strataa_metaphlan_data, strataa_metaphlan_metadata, countries_of_interest = c('Malawi'), groups_of_interest = c('Acute_Typhi', 'Control_HealthySerosurvey'), comparisons = list(c('Acute_Typhi', 'Control_HealthySerosurvey')))

malawi_healthy_acute_alpha$alpha_anova_summary_with_var_names %>% dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>% kbl() %>% kable_styling()
|
rownames(alpha_anova_summary[[1]])
|
Df
|
Sum.Sq
|
Mean.Sq
|
F.value
|
Pr..F.
|
is_it_significant
|
|
Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
1
|
2.27
|
2.27
|
16
|
0.000167
|
significant
|
|
Antibiotics_taken_before_sampling_yes_no_assumptions
|
1
|
1.31
|
1.31
|
9.27
|
0.0034
|
significant
|
|
Sex
|
1
|
1.16
|
1.16
|
8.21
|
0.00567
|
significant
|
|
Group
|
1
|
0.84
|
0.84
|
5.93
|
0.0177
|
not_significant
|
|
Age
|
1
|
0.451
|
0.451
|
3.19
|
0.079
|
not_significant
|
|
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions
|
1
|
0.406
|
0.406
|
2.87
|
0.0952
|
not_significant
|
|
Sex:Group
|
1
|
0.313
|
0.313
|
2.21
|
0.142
|
not_significant
|
|
Sex:Group:Age
|
1
|
0.0978
|
0.0978
|
0.691
|
0.409
|
not_significant
|
|
Sex:Age
|
1
|
0.0164
|
0.0164
|
0.116
|
0.734
|
not_significant
|
|
Group:Age
|
1
|
0.00122
|
0.00122
|
0.00864
|
0.926
|
not_significant
|
|
Residuals
|
63
|
8.91
|
0.141
|
NA
|
NA
|
NA
|
malawi_healthy_acute_alpha$alpha_plot_group

malawi_healthy_acute_alpha$alpha_plot_antibiotics

Beta diversity -
metaphlan
All groups.
all_countries_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh', 'Malawi', 'Nepal'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
all_countries_beta_all_groups$pn_res %>% kbl %>% kable_styling()
|
|
R2
|
Pr(>F)
|
is_it_significant
|
|
Group
|
0.0759082
|
0.0009990
|
significant
|
|
Group:Age
|
0.0121511
|
0.0029970
|
significant
|
|
Sex:Group:Age
|
0.0081783
|
0.0909091
|
not_significant
|
|
Sex:Group
|
0.0079033
|
0.1238761
|
not_significant
|
|
Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0076947
|
0.1348651
|
not_significant
|
|
Age
|
0.0038070
|
0.1668332
|
not_significant
|
|
Sex
|
0.0035891
|
0.2147852
|
not_significant
|
|
Sex:Age
|
0.0035425
|
0.2227772
|
not_significant
|
|
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0035541
|
0.2247752
|
not_significant
|
|
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0027831
|
0.4545455
|
not_significant
|
|
Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0045854
|
0.7862138
|
not_significant
|
|
Total
|
1.0000000
|
NA
|
NA
|
|
Residual
|
0.8663032
|
NA
|
NA
|
bgd_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
bgd_beta_all_groups$pn_res %>% kbl %>% kable_styling()
|
|
R2
|
Pr(>F)
|
is_it_significant
|
|
Group
|
0.2679584
|
0.0009990
|
significant
|
|
Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0124851
|
0.0449550
|
not_significant
|
|
Sex:Age
|
0.0093987
|
0.1278721
|
not_significant
|
|
Age
|
0.0084979
|
0.1688312
|
not_significant
|
|
Sex:Group
|
0.0155821
|
0.1788212
|
not_significant
|
|
Sex
|
0.0078067
|
0.2357642
|
not_significant
|
|
Group:Age
|
0.0115137
|
0.5144855
|
not_significant
|
|
Sex:Group:Age
|
0.0104905
|
0.6063936
|
not_significant
|
|
Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0047215
|
0.6433566
|
not_significant
|
|
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0044103
|
0.7022977
|
not_significant
|
|
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0035306
|
0.8121878
|
not_significant
|
|
Total
|
1.0000000
|
NA
|
NA
|
|
Residual
|
0.6436043
|
NA
|
NA
|
# bgd_beta_all_groups$pc12
# bgd_beta_all_groups$pc34
mal_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Malawi'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
mal_beta_all_groups$pn_res %>% kbl %>% kable_styling()
|
|
R2
|
Pr(>F)
|
is_it_significant
|
|
Group
|
0.1942771
|
0.0009990
|
significant
|
|
Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0225358
|
0.0009990
|
significant
|
|
Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0247209
|
0.0019980
|
significant
|
|
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0169857
|
0.0089910
|
significant
|
|
Group:Age
|
0.0223640
|
0.0209790
|
not_significant
|
|
Sex:Age
|
0.0138711
|
0.0229770
|
not_significant
|
|
Sex:Group
|
0.0217097
|
0.0269730
|
not_significant
|
|
Age
|
0.0114622
|
0.0489510
|
not_significant
|
|
Sex
|
0.0075054
|
0.2667333
|
not_significant
|
|
Sex:Group:Age
|
0.0129653
|
0.4355644
|
not_significant
|
|
Total
|
1.0000000
|
NA
|
NA
|
|
Residual
|
0.6516027
|
NA
|
NA
|
# mal_beta_all_groups$pc12
# mal_beta_all_groups$pc34
nep_beta_all_groups <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Nepal'), c('Acute_Typhi', 'Carrier', 'Control_HealthySerosurvey'))
nep_beta_all_groups$pn_res %>% kbl %>% kable_styling()
|
|
R2
|
Pr(>F)
|
is_it_significant
|
|
Group
|
0.0898195
|
0.0009990
|
significant
|
|
Sex
|
0.0176933
|
0.1408591
|
not_significant
|
|
Sex:Age
|
0.0148693
|
0.2667333
|
not_significant
|
|
Age
|
0.0125478
|
0.4395604
|
not_significant
|
|
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0124906
|
0.4835165
|
not_significant
|
|
Sex:Group
|
0.0239383
|
0.5904096
|
not_significant
|
|
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0106433
|
0.6063936
|
not_significant
|
|
Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0228450
|
0.6373626
|
not_significant
|
|
Sex:Group:Age
|
0.0169339
|
0.9370629
|
not_significant
|
|
Group:Age
|
0.0162360
|
0.9750250
|
not_significant
|
|
Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0102948
|
1.0000000
|
not_significant
|
|
Total
|
1.0000000
|
NA
|
NA
|
|
Residual
|
0.7516884
|
NA
|
NA
|
all_countries_beta_all_groups$pc12 / bgd_beta_all_groups$pc12 / nep_beta_all_groups$pc12

Acute vs healthy.
all_countries_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh', 'Malawi', 'Nepal'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
all_countries_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
|
|
R2
|
Pr(>F)
|
is_it_significant
|
|
Group
|
0.0285252
|
0.0009990
|
significant
|
|
Group:Age
|
0.0117687
|
0.0109890
|
not_significant
|
|
Age
|
0.0092014
|
0.0419580
|
not_significant
|
|
Sex
|
0.0070879
|
0.1118881
|
not_significant
|
|
Sex:Group:Age
|
0.0072077
|
0.1288711
|
not_significant
|
|
Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0117438
|
0.2187812
|
not_significant
|
|
Sex:Age
|
0.0058394
|
0.2197802
|
not_significant
|
|
Sex:Group
|
0.0057923
|
0.2507493
|
not_significant
|
|
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0055130
|
0.2817183
|
not_significant
|
|
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0043096
|
0.4985015
|
not_significant
|
|
Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0070480
|
0.8281718
|
not_significant
|
|
Total
|
1.0000000
|
NA
|
NA
|
|
Residual
|
0.8959630
|
NA
|
NA
|
bgd_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Bangladesh'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
bgd_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
|
|
R2
|
Pr(>F)
|
is_it_significant
|
|
Group
|
0.0782718
|
0.0009990
|
significant
|
|
Sex
|
0.0230592
|
0.0339660
|
not_significant
|
|
Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0199564
|
0.0689311
|
not_significant
|
|
Sex:Age
|
0.0168558
|
0.1618382
|
not_significant
|
|
Age
|
0.0146689
|
0.2337662
|
not_significant
|
|
Group:Age
|
0.0109377
|
0.4715285
|
not_significant
|
|
Sex:Group:Age
|
0.0100500
|
0.5704296
|
not_significant
|
|
Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0080933
|
0.7372627
|
not_significant
|
|
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0074906
|
0.8221778
|
not_significant
|
|
Sex:Group
|
0.0067483
|
0.9100899
|
not_significant
|
|
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0059228
|
0.9200799
|
not_significant
|
|
Total
|
1.0000000
|
NA
|
NA
|
|
Residual
|
0.7979452
|
NA
|
NA
|
mal_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Malawi'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
mal_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
|
|
R2
|
Pr(>F)
|
is_it_significant
|
|
Group
|
0.1995900
|
0.0009990
|
significant
|
|
Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0381931
|
0.0009990
|
significant
|
|
Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0371210
|
0.0019980
|
significant
|
|
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0267524
|
0.0069930
|
significant
|
|
Sex:Group
|
0.0200496
|
0.0239760
|
not_significant
|
|
Age
|
0.0194908
|
0.0429570
|
not_significant
|
|
Sex:Age
|
0.0144797
|
0.1248751
|
not_significant
|
|
Sex
|
0.0141068
|
0.1508492
|
not_significant
|
|
Sex:Group:Age
|
0.0091754
|
0.4685315
|
not_significant
|
|
Group:Age
|
0.0078947
|
0.5994006
|
not_significant
|
|
Total
|
1.0000000
|
NA
|
NA
|
|
Residual
|
0.6131466
|
NA
|
NA
|
nep_beta_acute_healthy <- metaphlan_beta(strataa_metaphlan_data, strataa_metaphlan_metadata, c('Nepal'), c('Acute_Typhi', 'Control_HealthySerosurvey'))
nep_beta_acute_healthy$pn_res %>% kbl %>% kable_styling()
|
|
R2
|
Pr(>F)
|
is_it_significant
|
|
Group
|
0.0598990
|
0.0039960
|
significant
|
|
Sex
|
0.0484279
|
0.0139860
|
not_significant
|
|
Sex:Group
|
0.0246755
|
0.2857143
|
not_significant
|
|
Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0450942
|
0.3516484
|
not_significant
|
|
Sex:Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0244159
|
0.3516484
|
not_significant
|
|
Sex:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0207951
|
0.4975025
|
not_significant
|
|
Sex:Group:Age
|
0.0175717
|
0.6633367
|
not_significant
|
|
Age
|
0.0154038
|
0.7792208
|
not_significant
|
|
Sex:Age
|
0.0148604
|
0.8621379
|
not_significant
|
|
Group:Age
|
0.0102689
|
0.9850150
|
not_significant
|
|
Age:Antibiotics_taken_before_sampling_yes_no_assumptions
|
0.0188359
|
0.9990010
|
not_significant
|
|
Total
|
1.0000000
|
NA
|
NA
|
|
Residual
|
0.6997518
|
NA
|
NA
|
all_countries_beta_acute_healthy$pc12 + bgd_beta_acute_healthy$pc12 + mal_beta_acute_healthy$pc12 + nep_beta_acute_healthy$pc12

Plot phyla bar
plots
# get the taxa that are phyla, not classes, or below (species etc), and tidy the data.
phyla <- strataa_metaphlan_data %>% mutate(clade_name = rownames(strataa_metaphlan_data)) %>% filter(grepl("p__", clade_name)) %>% filter(!grepl("c__", clade_name)) %>% pivot_longer(!c(clade_name, lowest_taxonomic_level), names_to = "sample", values_to = "relative_abundance")
# relative_abundance > 1 returns a list of TRUE/FALSE values, which is then summed to get the number of samples in which the phylum is present at > 1% relative abundance.
# then we filter to only keep phyla that are present at 1% in at least 10% of samples.
phyla_to_exclude <- phyla %>% group_by(clade_name) %>%
summarise(count = sum(relative_abundance > 1)) %>%
filter(count < (length(unique(phyla$sample)) / 10)) %>%
pull(clade_name)
View(phyla_to_exclude)
# in order to make each sample add up to 100, we need to add the excluded taxa back in as a single "rare taxa" phylum.
# first we need to calculate the relative abundance of the excluded taxa in each sample.
excluded_phyla <- phyla %>%
filter(clade_name %in% phyla_to_exclude) %>% group_by(sample) %>% summarise(relative_abundance = sum(relative_abundance))
# then make a column with the name "rare_taxa" for each sample, annd bind it to the excluded taxa data.
rare_taxa_column <- data.frame(lowest_taxonomic_level = c(rep("rare_taxa", nrow(excluded_phyla))), clade_name = c(rep("rare_taxa", nrow(excluded_phyla))))
excluded_phyla <- cbind(rare_taxa_column, excluded_phyla)
# then remove the excluded taxa from the phyla data.
phyla_clean <- phyla %>%
filter(!(clade_name %in% phyla_to_exclude))
# and add the excluded taxa back in.
phyla_clean <- rbind(phyla_clean, excluded_phyla)
View(phyla_clean)
View(excluded_phyla)
colnames(strataa_metaphlan_metadata)
## [1] "Group"
## [2] "Sex"
## [3] "Country"
## [4] "Age"
## [5] "Antibiotics_taken_before_sampling_yes_no_assumptions"
## [6] "sequencing_lane"
## [7] "SampleID"
## [8] "age_bracket"
metadata_select <- strataa_metaphlan_metadata %>% dplyr::select(SampleID, Group, Country)
phyla_clean_metadata <- phyla_clean %>% left_join(metadata_select, by = c("sample" = "SampleID"))
phyla_clean_metadata <- phyla_clean_metadata %>% mutate(Group = ifelse(Group == "Acute_Typhi", "Typhi", Group)) %>% mutate(Group = ifelse(Group == "Control_HealthySerosurvey", "Healthy", Group))
order_of_groups <- c("Typhi", "Healthy")
plot_per_country_abundance <- function(phyla_clean_metadata, country, group_order){
# thanks chatgpt!
phyla_clean_country <- phyla_clean_metadata %>% filter(Country == country) %>% filter(Group %in% group_order)
phyla_clean_country_fct <- phyla_clean_country %>%
mutate(Group = factor(Group, levels = group_order), # Convert group to a factor with the desired order
group_order_numeric = as.numeric(Group), # Create a new numeric variable based on the order of group
sample = fct_reorder(sample, group_order_numeric)) # Reorder sample based on group_order_numeric
p <- ggplot(data = phyla_clean_country_fct, aes(x = sample, y = relative_abundance, fill = clade_name)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1), axis.title.x = element_blank()) +
scale_x_discrete(breaks=phyla_clean_country_fct$sample, labels=phyla_clean_country_fct$Group) +
ggtitle(country) +
ylim(0, 100.01)
p
}
bangladesh_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Bangladesh", group_order = order_of_groups)
# bangladesh_phyla_plot
malawi_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Malawi", group_order = order_of_groups)
nepal_phyla_plot <- plot_per_country_abundance(phyla_clean_metadata = phyla_clean_metadata, country = "Nepal", group_order = order_of_groups)
bangladesh_phyla_plot / malawi_phyla_plot / nepal_phyla_plot

# going back to phyla so that get all the weird rare ones.
# need to join to meta-data
# remove carriers
# group by clade, group, country
# summarise to get the median.
# pivot wider to get the table.
# dplyr::select(order(colnames(.))) rearranges the columns in alphabetical order
# relocate moves the clade_name column to the first column.
phyla %>% left_join(metadata_select, by = c("sample" = "SampleID")) %>% filter(Group != 'Carrier') %>% group_by(clade_name, Group, Country) %>% summarise(median_prevalence = median(relative_abundance)) %>% pivot_wider(names_from = c('Country', 'Group'), values_from = 'median_prevalence') %>% dplyr::select(order(colnames(.))) %>% relocate(clade_name, .before = 1) %>% kbl() %>% kable_styling()
|
clade_name
|
Bangladesh_Acute_Typhi
|
Bangladesh_Control_HealthySerosurvey
|
Malawi_Acute_Typhi
|
Malawi_Control_HealthySerosurvey
|
Nepal_Acute_Typhi
|
Nepal_Control_HealthySerosurvey
|
|
k__Archaea|p__Candidatus_Thermoplasmatota
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Archaea|p__Euryarchaeota
|
0.000000
|
0.000000
|
0.249010
|
0.000000
|
0.00000
|
0.22474
|
|
k__Archaea|p__Thaumarchaeota
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Bacteria|p__Acidobacteria
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Bacteria|p__Actinobacteria
|
14.361185
|
13.136440
|
1.893725
|
0.524965
|
3.33634
|
3.81550
|
|
k__Bacteria|p__Bacteria_unclassified
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Bacteria|p__Bacteroidetes
|
0.911970
|
2.216730
|
15.361165
|
34.630185
|
28.24679
|
20.53695
|
|
k__Bacteria|p__Candidatus_Melainabacteria
|
0.000000
|
0.000000
|
0.038085
|
0.052210
|
0.00000
|
0.02157
|
|
k__Bacteria|p__Candidatus_Saccharibacteria
|
0.017310
|
0.015625
|
0.000000
|
0.000000
|
0.00000
|
0.00203
|
|
k__Bacteria|p__Chloroflexi
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Bacteria|p__Elusimicrobia
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Bacteria|p__Firmicutes
|
80.670075
|
72.161565
|
60.161350
|
58.311770
|
53.74090
|
60.17250
|
|
k__Bacteria|p__Fusobacteria
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Bacteria|p__Lentisphaerae
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Bacteria|p__Proteobacteria
|
0.629715
|
1.272295
|
6.606440
|
6.195335
|
2.90174
|
2.52509
|
|
k__Bacteria|p__Spirochaetes
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Bacteria|p__Synergistetes
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Bacteria|p__Tenericutes
|
0.000000
|
0.000000
|
0.002715
|
0.000000
|
0.01965
|
0.01362
|
|
k__Bacteria|p__Verrucomicrobia
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Eukaryota|p__Ascomycota
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
|
k__Eukaryota|p__Eukaryota_unclassified
|
0.000000
|
0.000000
|
0.000000
|
0.000000
|
0.00000
|
0.00000
|
Taxa associated with
participant groups
basics, stats and
plots
Maaslin basics
groups_to_analyse <- c('Acute_Typhi', 'Control_HealthySerosurvey')
bang_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
mwi_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions", "sequencing_lane")
nep_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
bangladesh_taxonomic_maaslin <- read_in_maaslin('Bangladesh', groups_to_analyse, bang_variables_for_analysis, 'metaphlan')
malawi_taxonomic_maaslin <- read_in_maaslin('Malawi', groups_to_analyse, mwi_variables_for_analysis, 'metaphlan')
nepal_taxonomic_maaslin <- read_in_maaslin('Nepal', groups_to_analyse, nep_variables_for_analysis, 'metaphlan')
bangladesh_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(bangladesh_taxonomic_maaslin)
malawi_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(malawi_taxonomic_maaslin)
nepal_taxonomic_maaslin_filtered <- filter_taxonomic_maaslin(nepal_taxonomic_maaslin)
bangladesh_maaslin_stats <- basic_maaslin_stats(bangladesh_taxonomic_maaslin_filtered, 'Bangladesh', bang_variables_for_analysis, groups_to_analyse)
malawi_maaslin_stats <- basic_maaslin_stats(malawi_taxonomic_maaslin_filtered, 'Malawi', mwi_variables_for_analysis, groups_to_analyse)
nepal_maaslin_stats <- basic_maaslin_stats(nepal_taxonomic_maaslin_filtered, 'Nepal', nep_variables_for_analysis, groups_to_analyse)
malawi_maaslin_stats$volcano_plot / bangladesh_maaslin_stats$volcano_plot / nepal_maaslin_stats$volcano_plot

nrow(malawi_maaslin_stats$maaslin_results_sig)
## [1] 296
nrow(bangladesh_maaslin_stats$maaslin_results_sig)
## [1] 23
nrow(nepal_maaslin_stats$maaslin_results_sig)
## [1] 0
There were 296 species significantly (q-val < 0.05) associated
with health/disease in Malawi, in Bangladesh, and in Nepal.
Combine the taxonomic maaslins, and print out the species that are
sig in both and share directions.
Because sequencing run and participant type are totally confounded
for Bangladesh, need to exclude sequencing run from the final model for
Bangladesh (otherwise, wipes out the signals).
between acute <-> health.
###Â associated at both sites
combined_results <- run_combine_maaslins(bangladesh_taxonomic_maaslin_filtered, malawi_taxonomic_maaslin_filtered, bang_variables_for_analysis, mwi_variables_for_analysis, groups_to_analyse, 'metaphlan', maaslin_taxonomic_output_root_folder)
combined_results$positive_coef %>% filter(grepl('^s', lowest_taxonomic_level)) %>%
select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>%
rename(Species = lowest_taxonomic_level, `Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>%
dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>%
kbl() %>% kable_styling()
|
feature
|
Species
|
Coefficient Bangladesh
|
Standard Error Bangladesh
|
Q-value Bangladesh
|
Coefficient Malawi
|
Standard Error Malawi
|
Q-value Malawi
|
|
k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Clostridiaceae.g__Clostridium.s__Clostridium_SGB6179
|
s__Clostridium_SGB6179
|
8.79
|
1.34
|
4.25e-05
|
3.11
|
0.936
|
0.0286
|
|
k__Bacteria.p__Bacteroidetes.c__Bacteroidia.o__Bacteroidales.f__Prevotellaceae.g__Prevotella.s__Prevotella_copri_clade_A
|
s__Prevotella_copri_clade_A
|
4.54
|
0.978
|
0.00971
|
7.64
|
1.25
|
1.21e-05
|
|
k__Bacteria.p__Firmicutes.c__Negativicutes.o__Veillonellales.f__Veillonellaceae.g__GGB4266.s__GGB4266_SGB5809
|
s__GGB4266_SGB5809
|
4.58
|
1.09
|
0.0147
|
6.94
|
0.981
|
4.38e-07
|
|
k__Bacteria.p__Proteobacteria.c__Gammaproteobacteria.o__Pasteurellales.f__Pasteurellaceae.g__Haemophilus.s__Haemophilus_parainfluenzae
|
s__Haemophilus_parainfluenzae
|
3.64
|
0.979
|
0.03
|
6.92
|
0.953
|
2.26e-07
|
|
k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Peptostreptococcaceae.g__Romboutsia.s__Romboutsia_timonensis
|
s__Romboutsia_timonensis
|
4.52
|
1.25
|
0.0386
|
5.07
|
0.903
|
5.91e-05
|
combined_results$negative_coef %>% filter(grepl('^s', lowest_taxonomic_level)) %>%
select(!c(metadata, value, N_bang, N.not.0_bang, pval_bang, N_mal, N.not.0_mal, pval_mal)) %>%
rename(Species = lowest_taxonomic_level, `Coefficient Bangladesh` = coef_bang, `Standard Error Bangladesh` = stderr_bang, `Q-value Bangladesh` = qval_bang, `Coefficient Malawi` = coef_mal, `Standard Error Malawi` = stderr_mal, `Q-value Malawi` = qval_mal) %>%
dplyr::mutate_if(is.numeric, funs(as.character(signif(., 3)))) %>%
kbl() %>% kable_styling()
|
feature
|
Species
|
Coefficient Bangladesh
|
Standard Error Bangladesh
|
Q-value Bangladesh
|
Coefficient Malawi
|
Standard Error Malawi
|
Q-value Malawi
|
|
k__Bacteria.p__Firmicutes.c__Clostridia.o__Eubacteriales.f__Lachnospiraceae.g__Lachnospiraceae_unclassified.s__Lachnospiraceae_bacterium
|
s__Lachnospiraceae_bacterium
|
-3.05
|
0.838
|
0.0366
|
-2.87
|
0.785
|
0.0135
|
nrow(combined_results$mwi_maaslin_only)
## [1] 290
nrow(combined_results$bang_maaslin_only)
## [1] 17
There were 290 species significantly (q-val < 0.05) associated
with health/disease in Malawi only and 17 in Bangladesh only.
###Â associated at only one site
The ones associated at only one site are written out to a file, you
can look at them manually there.
plots of species of
interest
strataa_metaphlan_data_longer <- strataa_metaphlan_data %>% mutate(feature = rownames(strataa_metaphlan_data)) %>% pivot_longer(!c(feature, lowest_taxonomic_level), names_to = "SampleID", values_to = "prevalence")
strataa_metaphlan_data_longer_meta <- strataa_metaphlan_data_longer %>% left_join(strataa_metaphlan_metadata, by = c("SampleID" = "SampleID"))
pc <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Prevotella_copri_clade_A')

cs <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Clostridium_SGB6179')

SGB5809 <-run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__GGB4266_SGB5809')

hp <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Haemophilus_parainfluenzae')

rt <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Romboutsia_timonensis')

lb <- run_plot_species_of_interest(strataa_metaphlan_data_longer_meta, 's__Lachnospiraceae_bacterium')

# pc + cs + SGB5809 + hp + rt + lb
P. copri clades
Print the results for all the P. copri clades
all_taxa_maaslin <- combined_maaslins$all_features
p_copri_maaslin <- all_taxa_maaslin %>% filter(grepl('_Prevotella_copri_', feature)) %>% filter(qval_bang < 0.05 | qval_mal < 0.05)
write_csv(p_copri_maaslin, file.path(maaslin_taxonomic_output_root_folder, 'combined_maaslins_p_copri.csv'))
maaslin functional
groups
Functional groups associated with health
bang_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
mwi_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions", "sequencing_lane")
# nep_variables_for_analysis <- c("Group", "Sex", "Age", "Antibiotics_taken_before_sampling_yes_no_assumptions")
groups_to_analyse <- c('Acute_Typhi', 'Control_HealthySerosurvey')
bang_func_maaslin <- read_in_maaslin('Bangladesh', groups_to_analyse, bang_variables_for_analysis, 'bigmap')
mal_func_maaslin <- read_in_maaslin('Malawi', groups_to_analyse, mwi_variables_for_analysis, 'bigmap')
combined_results <- run_combine_maaslins(bang_func_maaslin, mal_func_maaslin, bang_variables_for_analysis, mwi_variables_for_analysis, groups_to_analyse, 'bigmap', maaslin_functional_output_root_folder)
combined_results$positive_coef %>% kbl() %>% kable_styling()
|
MGC_class
|
Species
|
feature
|
metadata
|
value
|
coef_bang
|
stderr_bang
|
N_bang
|
N.not.0_bang
|
pval_bang
|
qval_bang
|
coef_mal
|
stderr_mal
|
N_mal
|
N.not.0_mal
|
pval_mal
|
qval_mal
|
|
succinate2propionate
|
Dialister_invisus_DSM_15470_genomic_scaffold
|
gb.GG698602.1.region002.GC_DNA..Entryname.succinate2propionate..OS.Dialister_invisus_DSM_15470_genomic_scaffold..SMASHregion.region002..NR.1
|
Group
|
Control_HealthySerosurvey
|
4.487491
|
0.7685363
|
80
|
57
|
0.0000001
|
0.0002378
|
4.319261
|
0.9371688
|
55
|
44
|
0.0000334
|
0.0019648
|
|
Pyruvate2acetate.formate
|
Prevotella_copri_strain_AM22.19
|
gb.QRIF01000001.1.region001.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Prevotella_copri_strain_AM22.19..SMASHregion.region001..NR.7
|
Group
|
Control_HealthySerosurvey
|
4.947861
|
0.8647077
|
80
|
73
|
0.0000002
|
0.0002593
|
4.763907
|
1.2291770
|
55
|
50
|
0.0003426
|
0.0122273
|
|
TPP_AA_metabolism
|
Prevotella_sp._S7_MS_2
|
gb.JRPT01000001.1.region001.GC_DNA..Entryname.TPP_AA_metabolism..OS.Prevotella_sp._S7_MS_2..SMASHregion.region001..NR.11..BG.2
|
Group
|
Control_HealthySerosurvey
|
4.861273
|
0.8899833
|
80
|
75
|
0.0000006
|
0.0003811
|
5.135414
|
1.1610044
|
55
|
52
|
0.0000610
|
0.0031362
|
|
Arginine2_Hcarbonate
|
.Clostridium._sordellii_genome_assembly
|
gb.LN679998.1.region007.GC_DNA..Entryname.Arginine2_Hcarbonate..OS..Clostridium._sordellii_genome_assembly..SMASHregion.region007..NR.2
|
Group
|
Control_HealthySerosurvey
|
4.586442
|
0.8235724
|
80
|
62
|
0.0000004
|
0.0003811
|
3.511901
|
0.7321910
|
55
|
42
|
0.0000181
|
0.0011569
|
|
Others_HGD_unassigned
|
Prevotella_copri_strain_AF24.12
|
gb.QRVA01000039.1.region001.GC_DNA..Entryname.Others_HGD_unassigned..OS.Prevotella_copri_strain_AF24.12..SMASHregion.region001..NR.6
|
Group
|
Control_HealthySerosurvey
|
4.723351
|
0.8622062
|
80
|
74
|
0.0000005
|
0.0003811
|
5.742395
|
1.0935772
|
55
|
51
|
0.0000040
|
0.0003585
|
|
Pyruvate2acetate.formate
|
Prevotella_sp._AM34.19LB
|
gb.QSIG01000002.1.region002.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Prevotella_sp._AM34.19LB..SMASHregion.region002..NR.3
|
Group
|
Control_HealthySerosurvey
|
4.247639
|
0.7826518
|
80
|
73
|
0.0000007
|
0.0003811
|
3.278032
|
0.9556571
|
55
|
48
|
0.0013025
|
0.0335603
|
|
TPP_fatty_acids
|
Prevotella_copri_strain_AM22.19
|
gb.QRIF01000003.1.region001.GC_DNA..Entryname.TPP_fatty_acids..OS.Prevotella_copri_strain_AM22.19..SMASHregion.region001..NR.6
|
Group
|
Control_HealthySerosurvey
|
5.074953
|
0.9522039
|
80
|
72
|
0.0000010
|
0.0004225
|
5.334982
|
1.0684820
|
55
|
52
|
0.0000094
|
0.0007055
|
|
Rnf_complex
|
Prevotella_copri_strain_AF38.11
|
gb.QROP01000022.1.region001.GC_DNA..Entryname.Rnf_complex..OS.Prevotella_copri_strain_AF38.11..SMASHregion.region001..NR.6
|
Group
|
Control_HealthySerosurvey
|
4.736879
|
0.9139504
|
80
|
72
|
0.0000018
|
0.0006485
|
5.228857
|
1.1248300
|
55
|
51
|
0.0000293
|
0.0017618
|
|
PFOR_II_pathway
|
Romboutsia_ilealis_strain_CRIB_genome
|
gb.LN555523.1.region001.GC_DNA..Entryname.PFOR_II_pathway..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region001..NR.1
|
Group
|
Control_HealthySerosurvey
|
4.841984
|
0.9834626
|
80
|
63
|
0.0000049
|
0.0010417
|
2.979921
|
0.7187059
|
55
|
44
|
0.0001475
|
0.0062890
|
|
Others_HGD_unassigned
|
Romboutsia_ilealis_strain_CRIB_genome
|
gb.LN555523.1.region003.GC_DNA..Entryname.Others_HGD_unassigned..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region003..NR.1
|
Group
|
Control_HealthySerosurvey
|
4.591409
|
0.9348343
|
80
|
67
|
0.0000052
|
0.0010472
|
2.761655
|
0.5584659
|
55
|
39
|
0.0000110
|
0.0008116
|
|
porA
|
Prevotella_copri_strain_AF10.17
|
gb.QSAV01000013.1.region001.GC_DNA..Entryname.porA..OS.Prevotella_copri_strain_AF10.17..SMASHregion.region001..NR.6
|
Group
|
Control_HealthySerosurvey
|
4.459775
|
0.9181644
|
80
|
73
|
0.0000063
|
0.0012403
|
5.078009
|
1.1688664
|
55
|
51
|
0.0000786
|
0.0038673
|
|
Rnf_complex
|
Romboutsia_ilealis_strain_CRIB_genome
|
gb.LN555523.1.region002.GC_DNA..Entryname.Rnf_complex..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region002..NR.1
|
Group
|
Control_HealthySerosurvey
|
4.363720
|
0.9569944
|
80
|
60
|
0.0000196
|
0.0031040
|
2.830776
|
0.6216771
|
55
|
40
|
0.0000400
|
0.0022830
|
|
PFOR_II_pathway
|
Romboutsia_ilealis_strain_CRIB_genome
|
gb.LN555523.1.region004.GC_DNA..Entryname.PFOR_II_pathway..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region004..NR.1
|
Group
|
Control_HealthySerosurvey
|
4.389654
|
0.9675717
|
80
|
69
|
0.0000213
|
0.0031830
|
3.365707
|
0.8138609
|
55
|
45
|
0.0001526
|
0.0064576
|
|
Formate_dehydrogenase
|
Haemophilus_sp._HMSC61B11_genomic_scaffold
|
gb.KV837963.1.region001.GC_DNA..Entryname.Formate_dehydrogenase..OS.Haemophilus_sp._HMSC61B11_genomic_scaffold..SMASHregion.region001..NR.5
|
Group
|
Control_HealthySerosurvey
|
3.408756
|
0.7538940
|
80
|
47
|
0.0000225
|
0.0032720
|
4.891279
|
0.9853786
|
55
|
45
|
0.0000104
|
0.0007674
|
|
Pyruvate2acetate.formate.Acetyl.CoA_pathway
|
Romboutsia_ilealis_strain_CRIB_genome
|
gb.LN555523.1.region005.GC_DNA..Entryname.Pyruvate2acetate.formate.Acetyl.CoA_pathway..OS.Romboutsia_ilealis_strain_CRIB_genome..SMASHregion.region005..NR.1
|
Group
|
Control_HealthySerosurvey
|
4.255279
|
0.9998841
|
80
|
74
|
0.0000595
|
0.0072007
|
4.577294
|
1.1095387
|
55
|
51
|
0.0001575
|
0.0065929
|
|
Pyruvate2acetate.formate
|
Clostridium_carboxidivorans
|
gb.CP011803.1.region013.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Clostridium_carboxidivorans..SMASHregion.region013..NR.1
|
Group
|
Control_HealthySerosurvey
|
3.692280
|
0.8904588
|
80
|
62
|
0.0000880
|
0.0095422
|
3.111543
|
0.7615447
|
55
|
41
|
0.0001784
|
0.0073319
|
|
OD_eut_pdu_related.PFOR_II_pathway
|
Paeniclostridium_sordellii_strain_AM370
|
gb.CP014150.1.region011.GC_DNA..Entryname.OD_eut_pdu_related.PFOR_II_pathway..OS.Paeniclostridium_sordellii_strain_AM370..SMASHregion.region011..NR.4
|
Group
|
Control_HealthySerosurvey
|
2.611181
|
0.6285831
|
80
|
48
|
0.0000856
|
0.0095422
|
3.944312
|
0.5736462
|
55
|
34
|
0.0000000
|
0.0000046
|
|
TPP_AA_metabolism.Arginine2putrescine
|
Haemophilus_sp._HMSC61B11_genomic_scaffold
|
gb.KV837955.1.region001.GC_DNA..Entryname.TPP_AA_metabolism.Arginine2putrescine..OS.Haemophilus_sp._HMSC61B11_genomic_scaffold..SMASHregion.region001..NR.5
|
Group
|
Control_HealthySerosurvey
|
3.928049
|
0.9505741
|
80
|
52
|
0.0000925
|
0.0095901
|
4.391415
|
0.9179545
|
55
|
49
|
0.0000188
|
0.0011989
|
|
Respiratory_glycerol
|
Haemophilus_parainfluenzae_HK2019
|
gb.AJTC01000042.1.region001.GC_DNA..Entryname.Respiratory_glycerol..OS.Haemophilus_parainfluenzae_HK2019..SMASHregion.region001..NR.5
|
Group
|
Control_HealthySerosurvey
|
3.546442
|
0.8817680
|
80
|
55
|
0.0001363
|
0.0135794
|
5.139593
|
0.9752294
|
55
|
47
|
0.0000037
|
0.0003443
|
|
Pyruvate2acetate.formate
|
Haemophilus_parainfluenzae_HK262
|
gb.AJMW01000041.1.region001.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Haemophilus_parainfluenzae_HK262..SMASHregion.region001..NR.5
|
Group
|
Control_HealthySerosurvey
|
3.168962
|
0.7913820
|
80
|
49
|
0.0001450
|
0.0136983
|
5.084917
|
1.0124480
|
55
|
47
|
0.0000085
|
0.0006570
|
|
TPP_AA_metabolism
|
Clostridium_celatum_DSM_1785_genomic_scaffold
|
gb.KB291615.1.region002.GC_DNA..Entryname.TPP_AA_metabolism..OS.Clostridium_celatum_DSM_1785_genomic_scaffold..SMASHregion.region002..NR.1
|
Group
|
Control_HealthySerosurvey
|
3.243306
|
0.8125781
|
80
|
49
|
0.0001517
|
0.0137569
|
2.499383
|
0.7198030
|
55
|
35
|
0.0011514
|
0.0303032
|
|
Arginine2putrescine.Putrescine2spermidine
|
Romboutsia_sp._Frifi_strain_FRIFI_genome
|
gb.LN650648.1.region002.GC_DNA..Entryname.Arginine2putrescine.Putrescine2spermidine..OS.Romboutsia_sp._Frifi_strain_FRIFI_genome..SMASHregion.region002..NR.1
|
Group
|
Control_HealthySerosurvey
|
2.663903
|
0.6834368
|
80
|
49
|
0.0002095
|
0.0177351
|
2.571396
|
0.5227425
|
55
|
37
|
0.0000120
|
0.0008682
|
|
Pyruvate2acetate.formate
|
Haemophilus_haemolyticus_HK386
|
gb.AJSV01000030.1.region001.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Haemophilus_haemolyticus_HK386..SMASHregion.region001..NR.2
|
Group
|
Control_HealthySerosurvey
|
2.378368
|
0.6314161
|
80
|
25
|
0.0003269
|
0.0233914
|
5.021323
|
0.7241403
|
55
|
37
|
0.0000000
|
0.0000040
|
|
Pyruvate2acetate.formate
|
Haemophilus_aegyptius_ATCC_11116_genomic_scaffold
|
gb.GL878527.1.region001.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Haemophilus_aegyptius_ATCC_11116_genomic_scaffold..SMASHregion.region001..NR.2
|
Group
|
Control_HealthySerosurvey
|
2.158465
|
0.5739901
|
80
|
29
|
0.0003339
|
0.0235574
|
3.518440
|
0.6789089
|
55
|
37
|
0.0000050
|
0.0004277
|
|
acetate2butyrate
|
Clostridium_botulinum_B_str._Eklund
|
gb.CP001056.1.region002.GC_DNA..Entryname.acetate2butyrate..OS.Clostridium_botulinum_B_str._Eklund..SMASHregion.region002..NR.5..BG.2
|
Group
|
Control_HealthySerosurvey
|
2.416257
|
0.6459628
|
80
|
33
|
0.0003569
|
0.0245034
|
2.529664
|
0.7542482
|
55
|
33
|
0.0016248
|
0.0395537
|
|
OD_fatty_acids
|
Dialister_succinatiphilus_YIT_11850_genomic_scaffold
|
gb.JH591188.1.region001.GC_DNA..Entryname.OD_fatty_acids..OS.Dialister_succinatiphilus_YIT_11850_genomic_scaffold..SMASHregion.region001..NR.1
|
Group
|
Control_HealthySerosurvey
|
2.937359
|
0.7931904
|
80
|
40
|
0.0004043
|
0.0264919
|
2.542256
|
0.5789555
|
55
|
36
|
0.0000676
|
0.0034168
|
|
Pyruvate2acetate.formate
|
Clostridium_butyricum_strain_JKY6D1_chromosome
|
gb.CP013352.1.region004.GC_DNA..Entryname.Pyruvate2acetate.formate..OS.Clostridium_butyricum_strain_JKY6D1_chromosome..SMASHregion.region004..NR.6
|
Group
|
Control_HealthySerosurvey
|
3.461246
|
0.9377451
|
80
|
59
|
0.0004211
|
0.0265098
|
3.345147
|
0.9358449
|
55
|
39
|
0.0008517
|
0.0240541
|
|
TPP_AA_metabolism
|
Haemophilus_sputorum_HK_2154
|
gb.ALJP01000004.1.region001.GC_DNA..Entryname.TPP_AA_metabolism..OS.Haemophilus_sputorum_HK_2154..SMASHregion.region001..NR.1
|
Group
|
Control_HealthySerosurvey
|
2.955534
|
0.8121614
|
80
|
40
|
0.0005000
|
0.0298840
|
4.518892
|
0.8662388
|
55
|
39
|
0.0000045
|
0.0003922
|
|
Fumarate2succinate
|
Haemophilus_sp._HMSC61B11_genomic_scaffold
|
gb.KV838003.1.region001.GC_DNA..Entryname.Fumarate2succinate..OS.Haemophilus_sp._HMSC61B11_genomic_scaffold..SMASHregion.region001..NR.5
|
Group
|
Control_HealthySerosurvey
|
3.338659
|
0.9243056
|
80
|
44
|
0.0005465
|
0.0311936
|
4.623872
|
1.0122075
|
55
|
46
|
0.0000382
|
0.0021980
|
|
Respiratory_glycerol
|
Aggregatibacter_sp._oral_taxon_458_str._W10330_genomic_scaffold
|
gb.KE952617.1.region001.GC_DNA..Entryname.Respiratory_glycerol..OS.Aggregatibacter_sp._oral_taxon_458_str._W10330_genomic_scaffold..SMASHregion.region001..NR.1
|
Group
|
Control_HealthySerosurvey
|
2.906426
|
0.8078251
|
80
|
41
|
0.0005726
|
0.0323199
|
4.637692
|
0.9258834
|
55
|
44
|
0.0000089
|
0.0006780
|
|
TPP_AA_metabolism
|
Haemophilus_pittmaniae_HK_85
|
gb.AFUV01000010.1.region001.GC_DNA..Entryname.TPP_AA_metabolism..OS.Haemophilus_pittmaniae_HK_85..SMASHregion.region001..NR.1
|
Group
|
Control_HealthySerosurvey
|
2.425728
|
0.6864843
|
80
|
40
|
0.0007059
|
0.0352706
|
5.215879
|
0.8956435
|
55
|
44
|
0.0000006
|
0.0000817
|
|
PFOR_II_pathway
|
Clostridium_celatum_DSM_1785_genomic_scaffold
|
gb.KB291660.1.region001.GC_DNA..Entryname.PFOR_II_pathway..OS.Clostridium_celatum_DSM_1785_genomic_scaffold..SMASHregion.region001..NR.1
|
Group
|
Control_HealthySerosurvey
|
3.513096
|
0.9944875
|
80
|
64
|
0.0007082
|
0.0352706
|
3.767830
|
0.9815067
|
55
|
41
|
0.0003837
|
0.0132101
|
|
acetate2butyrate
|
Clostridium_celatum_DSM_1785_genomic_scaffold
|
gb.KB291615.1.region001.GC_DNA..Entryname.acetate2butyrate..OS.Clostridium_celatum_DSM_1785_genomic_scaffold..SMASHregion.region001..NR.1
|
Group
|
Control_HealthySerosurvey
|
3.410399
|
0.9814122
|
80
|
64
|
0.0008526
|
0.0397344
|
3.413463
|
0.9314921
|
55
|
43
|
0.0006509
|
0.0197440
|
|
Rnf_complex
|
Haemophilus_parainfluenzae_HK262
|
gb.AJMW01000064.1.region001.GC_DNA..Entryname.Rnf_complex..OS.Haemophilus_parainfluenzae_HK262..SMASHregion.region001..NR.6..BG.2
|
Group
|
Control_HealthySerosurvey
|
2.804854
|
0.8137384
|
80
|
43
|
0.0009328
|
0.0417717
|
5.002957
|
0.9924891
|
55
|
44
|
0.0000080
|
0.0006358
|
|
OD_fatty_acids.PFOR_II_pathway
|
Clostridium_botulinum_B_str._Eklund
|
gb.CP001056.1.region005.GC_DNA..Entryname.OD_fatty_acids.PFOR_II_pathway..OS.Clostridium_botulinum_B_str._Eklund..SMASHregion.region005..NR.5..BG.2
|
Group
|
Control_HealthySerosurvey
|
2.572884
|
0.7716623
|
80
|
41
|
0.0013315
|
0.0497369
|
3.161364
|
0.8345813
|
55
|
31
|
0.0004481
|
0.0148068
|
combined_results$negative_coef %>% kbl() %>% kable_styling()
|
MGC_class
|
Species
|
feature
|
metadata
|
value
|
coef_bang
|
stderr_bang
|
N_bang
|
N.not.0_bang
|
pval_bang
|
qval_bang
|
coef_mal
|
stderr_mal
|
N_mal
|
N.not.0_mal
|
pval_mal
|
qval_mal
|
|
NADH_dehydrogenase_I
|
Actinomyces_viscosus_C505_genomic_scaffold
|
gb.KI391968.1.region002.GC_DNA..Entryname.NADH_dehydrogenase_I..OS.Actinomyces_viscosus_C505_genomic_scaffold..SMASHregion.region002..NR.5
|
Group
|
Control_HealthySerosurvey
|
-3.007475
|
0.5744596
|
80
|
74
|
0.0000015
|
0.0005676
|
-2.902767
|
0.7955322
|
55
|
48
|
0.0006822
|
0.0205316
|
|
Nitrate_reductase
|
Actinomyces_johnsonii_F0510_genomic_scaffold
|
gb.KE951633.1.region001.GC_DNA..Entryname.Nitrate_reductase..OS.Actinomyces_johnsonii_F0510_genomic_scaffold..SMASHregion.region001..NR.2
|
Group
|
Control_HealthySerosurvey
|
-2.757659
|
0.5439238
|
80
|
67
|
0.0000028
|
0.0008322
|
-2.257511
|
0.6572948
|
55
|
23
|
0.0012858
|
0.0332056
|
|
Rnf_complex.TPP_AA_metabolism
|
Eubacterium_siraeum_DSM_15702_Scfld_03_40_genomic
|
gb.DS499548.1.region001.GC_DNA..Entryname.Rnf_complex.TPP_AA_metabolism..OS.Eubacterium_siraeum_DSM_15702_Scfld_03_40_genomic..SMASHregion.region001..NR.2
|
Group
|
Control_HealthySerosurvey
|
-2.443337
|
0.7272441
|
80
|
20
|
0.0012292
|
0.0469508
|
-4.287272
|
1.2309921
|
55
|
43
|
0.0011166
|
0.0297739
|
combined_results$positive_coef %>% mutate(split_species = str_split(Species, '_')) %>% mutate(genus = sapply(split_species, "[[", 1)) %>% mutate(specific = sapply(split_species, "[[", 2)) %>% mutate(clean_species = paste(genus, specific, sep = ' ')) %>% select(-split_species, -genus, -specific) %>% relocate(clean_species, .after = Species) %>% group_by(MGC_class) %>% arrange(Species) %>% summarise(n = n(), Species = paste(clean_species, collapse = ', ')) %>% arrange(desc(n), MGC_class) %>% kbl() %>% kable_styling()
|
MGC_class
|
n
|
Species
|
|
Pyruvate2acetate.formate
|
7
|
Clostridium butyricum, Clostridium carboxidivorans, Haemophilus
aegyptius, Haemophilus haemolyticus, Haemophilus parainfluenzae,
Prevotella copri, Prevotella sp.
|
|
TPP_AA_metabolism
|
4
|
Clostridium celatum, Haemophilus pittmaniae, Haemophilus sputorum,
Prevotella sp.
|
|
PFOR_II_pathway
|
3
|
Clostridium celatum, Romboutsia ilealis, Romboutsia ilealis
|
|
Rnf_complex
|
3
|
Haemophilus parainfluenzae, Prevotella copri, Romboutsia ilealis
|
|
Others_HGD_unassigned
|
2
|
Prevotella copri, Romboutsia ilealis
|
|
Respiratory_glycerol
|
2
|
Aggregatibacter sp., Haemophilus parainfluenzae
|
|
acetate2butyrate
|
2
|
Clostridium botulinum, Clostridium celatum
|
|
Arginine2_Hcarbonate
|
1
|
.Clostridium. sordellii
|
|
Arginine2putrescine.Putrescine2spermidine
|
1
|
Romboutsia sp.
|
|
Formate_dehydrogenase
|
1
|
Haemophilus sp.
|
|
Fumarate2succinate
|
1
|
Haemophilus sp.
|
|
OD_eut_pdu_related.PFOR_II_pathway
|
1
|
Paeniclostridium sordellii
|
|
OD_fatty_acids
|
1
|
Dialister succinatiphilus
|
|
OD_fatty_acids.PFOR_II_pathway
|
1
|
Clostridium botulinum
|
|
Pyruvate2acetate.formate.Acetyl.CoA_pathway
|
1
|
Romboutsia ilealis
|
|
TPP_AA_metabolism.Arginine2putrescine
|
1
|
Haemophilus sp.
|
|
TPP_fatty_acids
|
1
|
Prevotella copri
|
|
porA
|
1
|
Prevotella copri
|
|
succinate2propionate
|
1
|
Dialister invisus
|
write_csv(combined_results$positive_coef, file.path(maaslin_functional_output_root_folder, 'combined_maaslins_positive_coef.csv'))
write_csv(combined_results$negative_coef, file.path(maaslin_functional_output_root_folder, 'combined_maaslins_negative_coef.csv'))
groups_to_analyse <- c('Carrier', 'Control_HealthySerosurvey')
mwi_variables_for_analysis <- c("Group", "Sex", "Age")
bang_variables_for_analysis <- c("Group", "Sex", "Age")
run_combine_maaslins(groups_to_analyse, mwi_variables_for_analysis, bang_variables_for_analysis, maaslin_functional_output_root_folder, 'bigmap')